Data Information

This data was sourced from a study conducted by Boudreault et al, cited below. Alternative splicing (AS) is a method of gene regulation in some Eukaryotes which modifies the sequence of RNA transcripts. Alternative Splicing changes the composition of the proteins resulting from the RNA transcripts through differential choice of exons included in mature mRNAs. This results in higher variability and diversity in the cellular proteome.The study from which this dataset is lifted looks into the global changes in RNA splicing that occurs after infection with a human virus.

Study Citation

: Boudreault S, Martenon-Brodeur C, Caron M, Garant JM, Tremblay MP, Armero VE, Durand M, Lapointe E, Thibault P, Tremblay-Létourneau M, Perreault JP, Scott MS, Lemay G, Bisaillon M. Global Profiling of the Cellular Alternative RNA Splicing Landscape during Virus-Host Interactions. PLoS One. 2016 Sep 6;11(9):e0161914.doi: 10.1371/journal.pone.0161914. PMID: 27598998; PMCID: PMC5012649. ________________________________________________________________________________

Setup

This data analysis uses additional packages beyond base R, which includes the package manager pacman.

knitr::opts_chunk$set(echo = TRUE,
                      # warning = FALSE,
                      # message = FALSE,n
                      fig.align = "center",
                      fig.height = 8)

#install.packages('pacman')
#install.packages(c("BiocManager", "RColorBrewer", "tidyverse", "devtools", "pheatmap", "gprofiler2"))
#install.packages("genekitr")
#install.packages("biomaRt")

#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR"))

pacman::p_load(dplyr, ggplot2, DESeq2, tidyverse, forcats, 
               readr, usedist, gridExtra, DESeq2, RColorBrewer, pheatmap,
               genekitr, clusterProfiler, EnhancedVolcano, ggfortify, biomartr,
               hypeR, gprofiler2, BiocManager)

#BiocManager::install(c("DESeq2", "org.Hs.eg.db", "EnhancedVolcano", "hypeR", "biomaRt"))


theme_set(theme_bw())

Bringing in the metadata_SRP074247.tsv file

This is reading in the metadata into a data frame and doing some preliminary data cleaning, which includes separating values and using the janitor function to clean up the metadata.

meta <- read.table(file = './Data/SRP074247/metadata_SRP074247.tsv',
           sep = '\t',
           header = TRUE)

janitor::clean_names(meta)
##   refinebio_accession_code experiment_accession refinebio_age
## 1               SRR3471108            SRP074247            NA
## 2               SRR3471109            SRP074247            NA
## 3               SRR3471110            SRP074247            NA
## 4               SRR3471111            SRP074247            NA
## 5               SRR3471112            SRP074247            NA
## 6               SRR3471113            SRP074247            NA
## 7               SRR3471114            SRP074247            NA
## 8               SRR3471115            SRP074247            NA
## 9               SRR3471116            SRP074247            NA
##   refinebio_cell_line refinebio_compound refinebio_developmental_stage
## 1                l929                 NA                            NA
## 2                l929                 NA                            NA
## 3                l929                 NA                            NA
## 4                l929                 NA                            NA
## 5                l929                 NA                            NA
## 6                l929                 NA                            NA
## 7                l929                 NA                            NA
## 8                l929                 NA                            NA
## 9                l929                 NA                            NA
##   refinebio_disease refinebio_disease_stage refinebio_genetic_information
## 1                NA                      NA                            NA
## 2                NA                      NA                            NA
## 3                NA                      NA                            NA
## 4                NA                      NA                            NA
## 5                NA                      NA                            NA
## 6                NA                      NA                            NA
## 7                NA                      NA                            NA
## 8                NA                      NA                            NA
## 9                NA                      NA                            NA
##   refinebio_organism                      refinebio_platform
## 1       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 2       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 3       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 4       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 5       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 6       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 7       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 8       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
## 9       MUS_MUSCULUS Illumina HiSeq 2000 (IlluminaHiSeq2000)
##   refinebio_processed refinebio_processor_id refinebio_processor_name
## 1                True                    381                 Tximport
## 2                True                    381                 Tximport
## 3                True                    381                 Tximport
## 4                True                    381                 Tximport
## 5                True                    381                 Tximport
## 6                True                    381                 Tximport
## 7                True                    381                 Tximport
## 8                True                    381                 Tximport
## 9                True                    381                 Tximport
##   refinebio_processor_version refinebio_race refinebio_sex
## 1              v1.25.8-hotfix             NA            NA
## 2              v1.25.8-hotfix             NA            NA
## 3              v1.25.8-hotfix             NA            NA
## 4              v1.25.8-hotfix             NA            NA
## 5              v1.25.8-hotfix             NA            NA
## 6              v1.25.8-hotfix             NA            NA
## 7              v1.25.8-hotfix             NA            NA
## 8              v1.25.8-hotfix             NA            NA
## 9              v1.25.8-hotfix             NA            NA
##   refinebio_source_archive_url refinebio_source_database
## 1                           NA                       SRA
## 2                           NA                       SRA
## 3                           NA                       SRA
## 4                           NA                       SRA
## 5                           NA                       SRA
## 6                           NA                       SRA
## 7                           NA                       SRA
## 8                           NA                       SRA
## 9                           NA                       SRA
##                         refinebio_specimen_part refinebio_subject
## 1 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 2 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 3 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 4 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 5 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 6 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 7 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 8 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
## 9 c3h/an mouse conjunctive tissue (fibroblasts)    l929 cell line
##   refinebio_time                                  refinebio_title
## 1             NA                          Mock Cells, Replicate 1
## 2             NA                          Mock Cells, Replicate 2
## 3             NA                          Mock Cells, Replicate 3
## 4             NA        Cells infected with Reovirus, replicate 1
## 5             NA        Cells infected with Reovirus, replicate 2
## 6             NA        Cells infected with Reovirus, replicate 3
## 7             NA Cells infected with Mutant Reovirus, replicate 1
## 8             NA Cells infected with Mutant Reovirus, replicate 2
## 9             NA Cells infected with Mutant Reovirus, replicate 3
##   refinebio_treatment meta_sra_age
## 1                  NA          100
## 2                  NA          100
## 3                  NA          100
## 4                  NA          100
## 5                  NA          100
## 6                  NA          100
## 7                  NA          100
## 8                  NA          100
## 9                  NA          100

Clean Meta file

It is important to clean the metadata so the data analysis is accurate and streamlined. meta_clean is a cleaned version of the meta data frame. The treatment variables we are interested in are Mock, Reovirus, and Mutant. The names in meta were cleaned to only include the treatment type and the replicate number. First the treatment type and replicate numbers are separated out into their own columns, and then recombined for a cleaner view of what each column is telling us.

meta_clean <-
  
meta |>
  # Changing columns to choose based on how our dataset is stored
  mutate('sample_ID' = refinebio_accession_code,
         'title' = refinebio_title,
         'treatment' = refinebio_title,
         'replicate' = refinebio_title,
         )

for(i in 1:nrow(meta_clean)){
  if(grepl('Mock', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'uninfected'}
  if(grepl('with Reovirus', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'infected'}
  if(grepl('Mutant', meta_clean$title[i])){
    meta_clean$treatment[i] <- 'mutant'}
}

for(i in 1:nrow(meta_clean)){
  if(grepl('1', meta_clean$title[i])){
    meta_clean$replicate[i] <- '1'}
  if(grepl('2', meta_clean$title[i])){
    meta_clean$replicate[i] <- '2'}
  if(grepl('3', meta_clean$title[i])){
    meta_clean$replicate[i] <- '3'}
}

meta_clean <-
  meta_clean |>
  filter(treatment %in% c('uninfected', 'infected')) |>
  mutate('sample' = paste(treatment, replicate, sep = "_")) |>
  select(sample_ID, sample, title, treatment, replicate) |>
  janitor::clean_names()

meta_clean
##    sample_id       sample                                     title  treatment
## 1 SRR3471108 uninfected_1                   Mock Cells, Replicate 1 uninfected
## 2 SRR3471109 uninfected_2                   Mock Cells, Replicate 2 uninfected
## 3 SRR3471110 uninfected_3                   Mock Cells, Replicate 3 uninfected
## 4 SRR3471111   infected_1 Cells infected with Reovirus, replicate 1   infected
## 5 SRR3471112   infected_2 Cells infected with Reovirus, replicate 2   infected
## 6 SRR3471113   infected_3 Cells infected with Reovirus, replicate 3   infected
##   replicate
## 1         1
## 2         2
## 3         3
## 4         1
## 5         2
## 6         3

Counts Matrix

The counts matrix is a data frame which contains the actual data output from the RNA-seq experiment, and will be used for the differential gene expression analysis.

Reading in the counts matrix

# Read in counts matrix
data <- read.table(file = './Data/SRP074247/SRP074247.tsv',
           sep = '\t',
           header = TRUE,
           row.names = 1) |>
  select(-c(SRR3471114, SRR3471115, SRR3471116))

head(data)
##                    SRR3471108   SRR3471109   SRR3471110  SRR3471111
## ENSMUSG00000000001 19889.8320 19485.824000 15567.427000 16893.19700
## ENSMUSG00000000003     0.0000     0.000000     0.000000     0.00000
## ENSMUSG00000000028   966.2573  1009.518100   790.992900   544.20966
## ENSMUSG00000000031     0.0000     0.000000     0.000000     0.00000
## ENSMUSG00000000037   186.0445   196.689480   166.312350    73.05573
## ENSMUSG00000000049     0.0000     1.311085     0.703601     4.89068
##                      SRR3471112  SRR3471113
## ENSMUSG00000000001 16797.210000 13591.75300
## ENSMUSG00000000003     0.000000     0.00000
## ENSMUSG00000000028   695.027340   492.18677
## ENSMUSG00000000031     0.000000     0.00000
## ENSMUSG00000000037    78.764824    76.60153
## ENSMUSG00000000049     1.354603     0.00000
identical(colnames(data), meta_clean$sample_id)
## [1] TRUE

This boolean flag checks that the cleaned data matches the original data, which it does.

Changing the Column Names

This is renaming the columns created above for treatment type (infected or uninfected) and replicate number on the counts matrix

colnames(data) <- paste(meta_clean$treatment, meta_clean$replicate, sep = "_")

tibble(data)
## # A tibble: 41,249 × 6
##    uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
##           <dbl>        <dbl>        <dbl>      <dbl>      <dbl>      <dbl>
##  1       19890.     19486.      15567.      16893.     16797.      13592. 
##  2           0          0           0           0          0           0  
##  3         966.      1010.        791.        544.       695.        492. 
##  4           0          0           0           0          0           0  
##  5         186.       197.        166.         73.1       78.8        76.6
##  6           0          1.31        0.704       4.89       1.35        0  
##  7         831.       830.        617.        254.       213.        181. 
##  8        3131.      3077.       2470.       2406.      2472.       2053. 
##  9        5645.      5921.       4623.      18342.     19434.      15835. 
## 10         714.       843.        781.        675.       706.        594. 
## # ℹ 41,239 more rows

This is adding the gene name to each row

data_clean <- rownames_to_column(data, 'gene')

tibble(data_clean)
## # A tibble: 41,249 × 7
##    gene  uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2 infected_3
##    <chr>        <dbl>        <dbl>        <dbl>      <dbl>      <dbl>      <dbl>
##  1 ENSM…       19890.     19486.      15567.      16893.     16797.      13592. 
##  2 ENSM…           0          0           0           0          0           0  
##  3 ENSM…         966.      1010.        791.        544.       695.        492. 
##  4 ENSM…           0          0           0           0          0           0  
##  5 ENSM…         186.       197.        166.         73.1       78.8        76.6
##  6 ENSM…           0          1.31        0.704       4.89       1.35        0  
##  7 ENSM…         831.       830.        617.        254.       213.        181. 
##  8 ENSM…        3131.      3077.       2470.       2406.      2472.       2053. 
##  9 ENSM…        5645.      5921.       4623.      18342.     19434.      15835. 
## 10 ENSM…         714.       843.        781.        675.       706.        594. 
## # ℹ 41,239 more rows

Pivoting the data frame

Initially, the read matrix had one row for every gene, and each column was the read count for each treatment and replicate. By pivoting the data, we now are able to have more than one row for each gene (there are now nine rows for each gene, as there are nine sample treatment types), and the columns are divided by sample, treatment, replicate, gene, and reads. This way, the data is easier to assess and the data frame contains more information than just the read counts.

data_long <-
  pivot_longer(data = data_clean,
               cols = -gene,
               names_to = 'ID',
               values_to = 'reads') |>
  
  mutate('sample' = ID) |>

  separate(col = 'ID',
           into = c('treatment', 'replicate'),
           sep = '_') |>
  
  mutate(treatment = as.factor(treatment),
         replicate = as.factor(replicate)) |>
  
  select(sample, treatment, replicate, gene, reads)

tibble(data_long)
## # A tibble: 247,494 × 5
##    sample       treatment  replicate gene                reads
##    <chr>        <fct>      <fct>     <chr>               <dbl>
##  1 uninfected_1 uninfected 1         ENSMUSG00000000001 19890.
##  2 uninfected_2 uninfected 2         ENSMUSG00000000001 19486.
##  3 uninfected_3 uninfected 3         ENSMUSG00000000001 15567.
##  4 infected_1   infected   1         ENSMUSG00000000001 16893.
##  5 infected_2   infected   2         ENSMUSG00000000001 16797.
##  6 infected_3   infected   3         ENSMUSG00000000001 13592.
##  7 uninfected_1 uninfected 1         ENSMUSG00000000003     0 
##  8 uninfected_2 uninfected 2         ENSMUSG00000000003     0 
##  9 uninfected_3 uninfected 3         ENSMUSG00000000003     0 
## 10 infected_1   infected   1         ENSMUSG00000000003     0 
## # ℹ 247,484 more rows
colors <- c('infected' = '#FA8633',
            'uninfected' = 'lightseagreen')

Checking the quality of the dataset

For gene expression profiling experiments, we hope for between 10-25 million reads per sample. Below is a bar plot showing the sum of the total reads for each sample

We must find if there is a significant different in the total counts by treatment

sum_by_sample <-
  data_long |> 
  group_by(treatment, sample) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()
## `summarise()` has grouped output by 'treatment'. You can override using the
## `.groups` argument.
ggplot(data = data_long,
       mapping = aes(x = sample)) +
          
  geom_col(data = sum_by_sample,
           mapping = aes(y = sum,
                         fill = treatment),
           color = 'black') +
  
  labs(x = 'Sample',
       y = 'Total Reads',
       title = 'Assessing the quality of the dataset using number of reads',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

Figure 1. A barplot assessing the quality of the dataset used. For each gene expression profiling experiment, a good quality dataset will have 10-25 million reads per sample. As shown in the plot, all six samples are within the range desired.

ggplot(data = sum_by_sample |> group_by(treatment) |> summarize('sum' = sum(sum)),
       mapping = aes(x = sum, 
                     y = treatment,
                     fill = treatment)) +
  
  geom_boxplot(data = data_long |> group_by(reads),
               mapping = aes(x = log2(reads))) +
  
  
  scale_fill_manual(values = colors) +
  
  labs(title = 'Number of reads by sample type',
       x = 'log2(reads)',
       y = NULL,
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))
## Warning: Removed 91153 rows containing non-finite values (`stat_boxplot()`).

Figure 2. A Boxplot of the number of reads by sample type. It is important to analyze the read counts between infected vs uninfected because if there are statistical differences, this will impact downstream biological interpretation.

Evalutating statistical significance

We are conducting a T-Test to evaluate whether or not there is a statistical difference in counts by treatment. We are hoping for a high p-value which will indicate no statistical significance in the difference in read counts by treatment.

Rather than conducting a T-test to establish statistical significance, we will use Analysis of variance (ANOVA). This is because we have 3 treatment variables, and a T test is only able to analyze significance between 2 variables.

t.test(colSums(data) ~ meta_clean$treatment)
## 
##  Welch Two Sample t-test
## 
## data:  colSums(data) by meta_clean$treatment
## t = 1.2091, df = 3.9817, p-value = 0.2935
## alternative hypothesis: true difference in means between group infected and group uninfected is not equal to 0
## 95 percent confidence interval:
##  -3614589  9173758
## sample estimates:
##   mean in group infected mean in group uninfected 
##                 31201792                 28422207

The p-value is .2935, so there is not a significant difference between total number of counts by treatment group.

Histogram of reads by gene

sum_by_gene <-
  data_long |> 
  group_by(gene) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()

ggplot(data = sum_by_gene,
       mapping = aes(x = log2(sum))) +
  
  geom_histogram(bins = 15,
                 color = 'black',
                 fill = '#9A6DA3') +
  
  labs(x = 'Log-2 transformed counts per gene',
       y = 'Number of genes',
       title = 'Number of reads by gene') +
  
   theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))
## Warning: Removed 9696 rows containing non-finite values (`stat_bin()`).

Figure 3. A histogram of the sum number of reads across each gene. This histogram is including low-abundance gene. There are six samples worth of read counts included in this histogram.

Excluding low abundance genes

high_abun_genes <-
  data_long |>
  pivot_wider(names_from = gene,
              values_from = reads) |>
  select(where(~ all(. != 0)))

high_abun_long <- 
  high_abun_genes |>
  pivot_longer(cols = -c(sample, treatment, replicate),
               names_to = 'gene',
               values_to = 'reads')

high_abun_sum <-
  high_abun_long |>
  group_by(gene) |> 
  summarize('sum' = sum(reads)) |>
  ungroup()

ggplot(data = high_abun_sum,
       mapping = aes(x = log2(sum))) +
  
  geom_histogram(bins = 15,
                 color = 'black',
                 fill = '#9A6DA3') +
  
  labs(x = 'Log-2 transformed counts per gene',
       y = 'Number of genes',
       title = 'Number of reads by gene (excluding low abundance genes)') +
  
   theme(plot.title = element_text(hjust = 0.5,
                                  size = 16))

Figure 4. A histogram of the sum number of reads across each gene. This histogram is excluding low abundance genes, or genes with read counts of zero. There are six samples worth of read counts included in this histogram.

non_normal_graph <-
  
ggplot(data = high_abun_long,
       mapping = aes(x = sample)) +
          
  geom_boxplot(mapping = aes(y = log2(reads),
                             fill = treatment),
              color = 'black') +
  
  labs(x = 'Sample ID',
       y = 'Log 2 transformed counts per gene',
       title = 'Non-normalized counts',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

non_normal_graph

Figure 5. A non-normalized boxplot of the sum number of reads across each gene. The boxplot provides insight into the distribution of read counts per sample.

sample_names <- c('uninfected_1', 'uninfected_2', 'uninfected_3', 'infected_1', 'infected_2', 'infected_3')
high_abun_log2 <-
  high_abun_long |> 
  mutate(reads = log2(reads)) |>
  pivot_wider(names_from = gene,
              values_from = reads)

high_abun_log2_flipped <-
  data.frame(t(high_abun_log2 |> select(-c(treatment, replicate)))) |> janitor::row_to_names(row_number = 1) |>
  mutate_all(function(x) as.numeric(as.character(x)))


data_dist <- dist(high_abun_log2, method = "euclidean")
## Warning in dist(high_abun_log2, method = "euclidean"): NAs introduced by
## coercion
data_dist <- dist_setNames(data_dist, sample_names)


plot(hclust(data_dist, method = "complete"), xlab = "Sample", main = NULL)

Figure 6. A dendrogram comparing the similarity of read count data across the different samples based on non-normalized data. The three infected samples are most similar to one another, and the three uninfected samples are most similar to one another, which is to be expected.

Normalization

Generating correction factors for the data where the median of each row is subtracted from the mean of the row. We then apply this correction factor to each sample.

sample_medians <- (apply(high_abun_log2_flipped, 2, median))

grand_median <- mean(sample_medians)

correction_factors <- grand_median - sample_medians

corrections <- data.frame(correction_factors)

corrections
##              correction_factors
## uninfected_1         -0.1308265
## uninfected_2         -0.1311785
## uninfected_3          0.1408655
## infected_1           -0.0419730
## infected_2           -0.0345170
## infected_3            0.1976295
norm_data <- high_abun_log2_flipped

for(i in 1:ncol(norm_data)){
  norm_data[,i] <- high_abun_log2_flipped[,i] + corrections$correction_factors[i]
}
norm_data_long <-
  norm_data |> rownames_to_column('gene') |>
  pivot_longer(cols = -gene,
               names_to = 'ID',
               values_to = 'normalized_reads') |>
  
  mutate('sample' = ID) |>

  separate(col = 'ID',
           into = c('treatment', 'replicate'),
           sep = '_') |>
  
  mutate(treatment = as.factor(treatment),
         replicate = as.factor(replicate)) |>
  
  select(sample, treatment, replicate, gene, normalized_reads)
normal_graph <-
  
ggplot(data = norm_data_long,
       mapping = aes(x = sample)) +
          
  geom_boxplot(mapping = aes(y = normalized_reads,
                             fill = treatment),
              color = 'black') +
  
  labs(x = 'Sample ID',
       y = 'Log 2 transformed counts per gene',
       title = 'Normalized counts',
       caption = 'Data from refine.bio\nAccession ID: SRP074247',
       fill = 'Sample type') +
  
  theme(plot.title = element_text(hjust = 0.5,
                                  size = 16),
        axis.text.x = element_text(angle = 90, 
                                   vjust = 0.5, 
                                   hjust = 1),
        axis.title.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        legend.position = 'right',
        plot.caption = element_text(face = 'italic')) +
  
  scale_fill_manual(values = colors) +
  
  scale_y_continuous(expand = c(0.01, 0))

normal_graph

Figure 7. A normalized boxplot of the sum number of reads across each gene. The boxplot provides insight into the distribution of read counts per sample.

grid.arrange(non_normal_graph, normal_graph, ncol=2)

Figure 8. A side by side comparison of the non-normalized vs normalized counts boxplots, to see what difference the normalization made in the figure outcome.

norm_log2 <-
  norm_data_long |>
  mutate(normalized_reads = log2(normalized_reads)) |>
  pivot_wider(names_from = gene,
              values_from = normalized_reads)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `normalized_reads = log2(normalized_reads)`.
## Caused by warning:
## ! NaNs produced
norm_dist <- dist(norm_log2, method = "euclidean")
## Warning in dist(norm_log2, method = "euclidean"): NAs introduced by coercion
norm_dist <- dist_setNames(norm_dist, sample_names)

plot(hclust(norm_dist, method = "complete"), xlab = "Sample", main = NULL)

Figure 9. A dendrogram comparing the similarity of read count data across the different samples based on normalized data. The three infected samples are most similar to one another, and the three uninfected samples are most similar to one another, which is to be expected.

Principal Component Analysis

Principle Component Analysis is a technique for analyzing large datasets with multiple features to be considered. By using PCA, one is able to maximize interpretability while maintaining the maximum amount of information. T

PCA <- prcomp(t(norm_dist))

autoplot(PCA, data = data_long, color = 'treatment', main = "Colored by Treatment", size = 3)
## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded

## Warning in data.frame(..., check.names = FALSE): row names were found from a
## short variable and have been discarded

Figure 10. PCA which colors by treatment. This PCA1 gives a percentage of variance of 42.59%, which indicates 42.59% of the variance in the data can be explained based on this variable, and PCA2 gives a percentage of variance of 19.93%. These variables are the two strongest sources of variance in the sample set.

Creating a DESeq Object

data_round <- 
  round(data.matrix(data))

#Make sure order is correct
head(data_round,2)
##                    uninfected_1 uninfected_2 uninfected_3 infected_1 infected_2
## ENSMUSG00000000001        19890        19486        15567      16893      16797
## ENSMUSG00000000003            0            0            0          0          0
##                    infected_3
## ENSMUSG00000000001      13592
## ENSMUSG00000000003          0
meta_clean[, c('sample')]
## [1] "uninfected_1" "uninfected_2" "uninfected_3" "infected_1"   "infected_2"  
## [6] "infected_3"
dds <- DESeqDataSetFromMatrix(countData = data_round,           
                                       colData = meta_clean,
                                       design = ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors

Pre-filtering DDS

By pre-filtering the DESeq2 functions, we are able to eliminate rows with very few reads and therefore reduce the memory used by the dds object. This improves speed and visualization quality.

# pre-filter to remove low-read rows
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]

We must choose a reference level for factors based on treatment type.

dds$treatment <- relevel(dds$treatment, ref = "uninfected")

Running Differential Expression Analysis

# run Differential Expression Analysis
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res <- results(dds)

head(res)
## log2 fold change (MLE): treatment infected vs uninfected 
## Wald test p-value: treatment infected vs uninfected 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000000001 16796.973      -0.185684 0.0467589  -3.97109 7.15435e-05
## ENSMUSG00000000028   737.681      -0.647278 0.1004754  -6.44215 1.17792e-10
## ENSMUSG00000000037   127.985      -1.227260 0.1929378  -6.36091 2.00559e-10
## ENSMUSG00000000056   477.308      -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058  2565.718      -0.291981 0.0559741  -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826       1.759546 0.0458610  38.36696 0.00000e+00
##                           padj
##                      <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00
print('Dimensions:')
## [1] "Dimensions:"
dim(res)
## [1] 23799     6
vsd <- vst(dds, blind=FALSE)

plotPCA(vsd, intgroup=c("treatment")) + theme_classic() + scale_color_manual(values = colors)
## using ntop=500 top features by variance

Figure 11. Principal Component Analysis using variance stabilizing transformation (vst). This data is transformed on a log2 scale and then normalized with respect of library size. The transformation is not blind because the experiment design has a large difference in counts between the treatment types, and blind dispersion estimates wil, overly shrink the values towards one another. This plot demonstrates a PCA based on the first two principle components of the data.

Hierarchical Clustering Heatmap

rld <- rlog(dds, blind=FALSE)
rld_mat <- assay(rld)  
rld_cor <- cor(rld_mat) 

heat.colors <- brewer.pal(6, "Blues")
pheatmap(rld_cor, color = heat.colors,
         breaks = c(0.97, 0.99, 0.995, 0.999, 0.9995, 0.99995))

Figure 12. A hierarchical clustering heatmap of the similarities between the read counts in each sample. This heatmap is comparing the similarity of each sample to each other sample, and the darker blue the sample is, the more similar the two are to one another. The uninfected samples are most similar to one another, with uninfected_1 and uninfected_2 being the most similar to each other. The infected samples are each similarly different to one another, which is indicated in the lightest blue shade.

Tabular results

infected_n_res <- results(dds, contrast = c("treatment", "infected", "uninfected"))
infected_n_sigs <- na.omit(infected_n_res)
infected_n_sigs <- subset(infected_n_sigs, padj < 0.05 & abs(log2FoldChange) > 1)
infected_n_sig_data <- merge(data.frame(infected_n_sigs),
                        data.frame(counts(dds, normalized = TRUE)), 
                        by = "row.names", sort=FALSE)

names(infected_n_sig_data)[1] <- "Ensembl_ID"

infected_n_res_data <- merge(data.frame(infected_n_res),
                        data.frame(counts(dds, normalized = TRUE)),
                        by="row.names", sort=FALSE)

names(infected_n_res_data)[1] <- "Ensembl_ID"  

head(res)
## log2 fold change (MLE): treatment infected vs uninfected 
## Wald test p-value: treatment infected vs uninfected 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000000001 16796.973      -0.185684 0.0467589  -3.97109 7.15435e-05
## ENSMUSG00000000028   737.681      -0.647278 0.1004754  -6.44215 1.17792e-10
## ENSMUSG00000000037   127.985      -1.227260 0.1929378  -6.36091 2.00559e-10
## ENSMUSG00000000056   477.308      -1.780200 0.1182042 -15.06039 2.95050e-51
## ENSMUSG00000000058  2565.718      -0.291981 0.0559741  -5.21637 1.82468e-07
## ENSMUSG00000000078 11555.826       1.759546 0.0458610  38.36696 0.00000e+00
##                           padj
##                      <numeric>
## ENSMUSG00000000001 2.38709e-04
## ENSMUSG00000000028 6.52838e-10
## ENSMUSG00000000037 1.09498e-09
## ENSMUSG00000000056 7.59753e-50
## ENSMUSG00000000058 7.89931e-07
## ENSMUSG00000000078 0.00000e+00

Gene ID Conversion

We are converting gene IDs from Ensemble ID to symbols for a clearer picture

ids <- as.character(infected_n_res_data$Ensembl_ID)

gene_list <- biomaRt::getBM(filter = 'ensembl_gene_id',
                  attributes = c("ensembl_gene_id","uniprot_gn_symbol"), 
                  values = ids,
                  mart = biomaRt::useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl"))

Adding the new symbols to the dataset

infected_n_res_data <- merge(infected_n_res_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")

infected_n_sig_data <- merge(infected_n_sig_data, gene_list, by.x="Ensembl_ID", by.y="ensembl_gene_id")
#output tabular files
write.csv(infected_n_res_data, 
          quote = FALSE,
          file = "infected_vs_uninfected_normalized_matrix.csv")

#infected_n_sig_data <- subset(infected_n_res_data_gene, 
                                 # padj < 0.05 & abs(log2FoldChange) > 1)

write.csv(infected_n_sig_data, 
          quote = FALSE,
          file = "infected_vs_uninfected_deg_padj0.05_log2fc1.csv")

Volcano Plot

EnhancedVolcano(infected_n_res_data,
                lab = as.character(infected_n_res_data$uniprot_gn_symbol),
                x = 'log2FoldChange',
                y = 'padj', 
                title = "Infected Versus Uninfected Controls",
                subtitle = "Differential expression",
                caption = paste0("Upregulated = 330, Downregulated = 222"),
                xlim = c(-10,10),
                ylim = c(0,20),
                FCcutoff = 1,
                pCutoff = 0.05,
                labSize = 4, 
                axisLabSize = 12, 
                col=c('grey40', 'grey40', 'grey40', '#FA8633'),
                legendLabels=c('Not sig.','Log2FC','padj', 'padj & Log2FC'),
                legendPosition = 'right',
                legendLabSize = 10,
                legendIconSize = 5.0,
                gridlines.major = FALSE,
                gridlines.minor = FALSE)
## Warning: One or more p-values is 0. Converting to 10^-1 * current lowest
## non-zero p-value...

Figure 13. A Volcano plot of the infected samples vs uninfected samples. Log transformed adjusted p-values are plotted on the y-axis while the log2 fold change is plotted on the x-axis.

# plotting upregulated gene

d <- plotCounts(dds, gene="ENSMUSG00000024678", intgroup="treatment", returnData=TRUE)

# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) + 
  geom_point(position=position_jitter(w = 0.1,h = 0),
             size = 2.5) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = colors) +
  labs(title = "Ms4a4d",
       x = 'Treatment',
       y = 'Count')

Figure 14. Demonstration of up-regulation using Ms4a4d gene. There is much higher gene expression in the infected samples compared to the uninfected sample, indicating this gene’s expression is impacted by virus-host interactions.

# plotting downregulated gene
d <- plotCounts(dds, gene="ENSMUSG00000056328", intgroup="treatment", returnData=TRUE)

# Plotting the MOV10 normalized counts, using the samplenames (rownames of d as labels)
ggplot(d, aes(x = treatment, y = count, color = treatment)) + 
  geom_point(position=position_jitter(w = 0.1,h = 0),
             size = 2.5) +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = colors) +
  labs(title = "Myh1",
       x = 'Treatment',
       y = 'Count')

Figure 14. Demonstration of down-regulation using Myh1 gene. There is much higher gene expression in the uninfected samples compared to the uninfected sample, indicating this gene’s expression is impacted by virus-host interactions.

Pathway Analysis

Gathering the top 20 differentially expressed genes in the dataset.

## Order results by padj values
top20_sigOE_genes <- infected_n_res_data |>
        arrange(padj) |>    #Arrange rows by padj values
        pull(uniprot_gn_symbol) |>      #Extract character vector of ordered genes
        head(n=20)      #Extract the first 20 genes

## normalized counts for top 20 significant genes
top20_sigOE_norm <- infected_n_sig_data |>
        filter(uniprot_gn_symbol %in% top20_sigOE_genes)

# Gathering the columns to have normalized counts to a single column
gathered_top20_sigOE <- top20_sigOE_norm |>
  gather(colnames(top20_sigOE_norm)[8:13], key = "sample", value = "normalized_counts")

## check the column header in the "gathered" data frame
head(gathered_top20_sigOE)
##           Ensembl_ID  baseMean log2FoldChange      lfcSE     stat pvalue padj
## 1 ENSMUSG00000000078 11555.826       1.759546 0.04586097 38.36696      0    0
## 2 ENSMUSG00000000275 12527.828       3.706646 0.07581458 48.89094      0    0
## 3 ENSMUSG00000000386 15980.147      10.206198 0.16649298 61.30107      0    0
## 4 ENSMUSG00000001123  3666.440       3.923118 0.06682685 58.70572      0    0
## 5 ENSMUSG00000002307  4991.193       3.812616 0.08460128 45.06570      0    0
## 6 ENSMUSG00000002325  1925.976       3.598447 0.07621423 47.21490      0    0
##   uniprot_gn_symbol       sample normalized_counts
## 1              Klf6 uninfected_1        5252.72999
## 2            Trim25 uninfected_1        1903.82384
## 3               Mx1 uninfected_1          37.22041
## 4            Lgals9 uninfected_1         443.85336
## 5              Daxx uninfected_1         551.79254
## 6              Irf9 uninfected_1         303.34632
gathered_top20_sigOE <- inner_join(meta_clean, gathered_top20_sigOE)
## Joining with `by = join_by(sample)`
## plot using ggplot2

ggplot(data = gathered_top20_sigOE,
       mapping = aes(x = as.character(uniprot_gn_symbol), y = normalized_counts, color = treatment)) +
  geom_point() +
  scale_y_log10() +
  xlab("Gene") +
  ylab("log10 Normalized Counts") +
  ggtitle("Top 20 Significant Differentially Expressed Genes") +
  theme_bw() +
  scale_color_manual(values = colors) +
      
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5))

Figure 15. Plot of the normalized read counts of the top 20 most significant differentially expressed genes in the dataset.

To determine whether certain items are over or underrepresented, we are using a hypergeometric test to determine the probability of having our observed proportion of genes. The KEGG database is used to compare the data against a manually curated database.

KEGG <- msigdb_gsets(species="Mus musculus", 
                     category="C2", 
                     subcategory="CP:KEGG",
                     clean = TRUE)
hyp_kegg <- hypeR(gene_list$uniprot_gn_symbol, 
                  KEGG, 
                  test="hypergeometric", 
                  background=50000, 
                  fdr=0.05, 
                  plotting=TRUE)
hyp_dots(hyp_kegg, 
         title="KEGG", 
         abrv=30, 
         val = "fdr")

Figure 16. Dot plot of the top enriched genesets from the KEGG database. The color of the dot is indicative of its significance, and its size is indicative of its geneset size.

Hallmark Pathways

Hallmark set summarized the information across multiple genesets, and reduced redundancy.

HALLMARK <- msigdb_gsets("Mus musculus", 
                         "H", 
                         "",
                         clean = TRUE)

# Hallmark
hyp_Hallmark <- hypeR(gene_list$uniprot_gn_symbol, 
                      HALLMARK, 
                      test="hypergeometric", 
                      background=50000, 
                      fdr=0.05, 
                      plotting=TRUE)

#show top pathway
hyp_dots(hyp_Hallmark, 
         title="HALLMARK", 
         abrv=50, 
         val = "fdr")

Figure 17. Dot Plot based on the Hallmark database. The color is indicative of the geneset significance, and its size is indicative of the geneset size.

Gene Ontology (GO)

GOBP<- msigdb_gsets(species = "Mus musculus", 
                    "C5", 
                    "GO:BP",
                    clean = TRUE)
# GOBP
hyp_GOBP <- hypeR(gene_list$uniprot_gn_symbol, 
                  GOBP, 
                  test="hypergeometric", 
                  background=50000, 
                  fdr=0.05, 
                  plotting=TRUE)

#show top pathway
hyp_dots(hyp_GOBP, title="GO: Biological Pathways", abrv=50, val = "fdr")
## Warning: Transformation introduced infinite values in continuous y-axis

Figure 18. Gene Ontology biological pathway visualization. This is referring to the biological role involving each gene. The color insicates the significance of the biologicak pathway, and the size of the dot indicates the geneset size.

Separating Up- and Down- regulated DE Genes

up_degs <- subset(infected_n_sig_data, log2FoldChange > 0)
up_degs <- up_degs$uniprot_gn_symbol
up_degs <- na.omit(up_degs)


down_degs <- subset(infected_n_sig_data, log2FoldChange < 0)
down_degs <- down_degs$uniprot_gn_symbol
down_degs <- na.omit(down_degs)
hyp_KEGG_up <- hypeR(up_degs,
                     KEGG, 
                     test="hypergeometric", 
                     background=50000, 
                     fdr=0.05, 
                     plotting=TRUE)

#show top pathway
#plot_KEGG_up <- 
  hyp_dots(hyp_KEGG_up, title="KEGG Pathways Upregulated in infected samples", abrv=50, val = "fdr")

#ggsave("Dotplot_KEGG_upregulated.png")

Figure 19. Pathway analysis using only upregulated genes.

# KEGG_down
hyp_KEGG_down <- hypeR(down_degs, 
                       KEGG, 
                       test="hypergeometric",
                       background=50000, 
                       fdr=0.05, 
                       plotting=TRUE)

#plot_KEGG_down <- 
hyp_dots(hyp_KEGG_down, title="KEGG Pathways Downregulated in infected samples", abrv=50, val = "fdr")

#ggsave("Dotplot_KEGG_downregulated.png")

Figure 20. Pathway analysis using only downregulated genes.

# THIS WHERE WE AT BUT IT'S BEDTIME
# TODO: Figure out how to get this to run
# TODO: Get the figures in their own folder(?)
sle_vs_n_gost <- gost(query = gene_list$uniprot_gn_symbol, 
                      organism = "mmusculus", 
                      sources = c("GO:BP", "KEGG", "REAC", "WP"),
                      significant = TRUE, 
                      correction_method ="fdr",
                      domain_scope="annotated")

gostplot(sle_vs_n_gost, interactive = TRUE)
# IDK WHAT THIS IS FOR
REACTOME <- msigdb_gsets(species="Mus musculus", 
                         category="C2", 
                         subcategory="CP:REACTOME")